Reading in Data that Was Provided and Manually Calculated Through Python

# Load the CPS ORG extract (creates the `org` data frame in the workspace).
load("org_2000_2024.RData")

# merge the state names using the "statefip" variable
url <- "https://github.com/wsundstrom/econ173/raw/data/statefips.dta"
states <- read_dta(url) %>% 
  select(statefip = statefips, state = statename, statecd)
# Join explicitly on statefip so the join cannot silently pick up any other
# column the two frames happen to share (and to suppress the implicit-key message).
org <- org %>% 
  left_join(states, by = "statefip")




# Scrape the HTML tables on the following website using rvest package:
# (url is consumed by the read_html() pipeline in the next chunk)
url <- "https://www.dol.gov/agencies/whd/state/minimum-wage/history"
# read in calculated values for percentage change (per occ2010 per year)
# NOTE(review): the name `data` masks base R's utils::data(); consider renaming.
data <- read_excel("percentage_change_results.xlsx")
#### ^ this code was generated in Python to get percent changes for every occ2010

Placing Minimum Wage Data

# Get the tables - this results in a list because there are multiple tables
minwage <- url %>%
  read_html() %>%
  html_nodes("table")

# Extract the idx-th scraped table as a data frame and rename its first
# column to "state" so all year-tables can be joined consistently.
# (Replaces four identical copy-pasted pipelines.)
extract_minwage_table <- function(tables, idx) {
  tab <- tables[[idx]] %>%
    html_table()
  colnames(tab)[1] <- "state"
  tab
}

# the tables with the years we want (2000-) are the 3rd-6th
minwage3 <- extract_minwage_table(minwage, 3)
minwage4 <- extract_minwage_table(minwage, 4)
minwage5 <- extract_minwage_table(minwage, 5)
minwage6 <- extract_minwage_table(minwage, 6)

# minwageL and minwageU are the lower and upper values for states with a range
# Build one long (state, year) panel of minimum wages from the four scraped tables.
mintab <- minwage3 %>% 
  left_join(minwage4) %>% 
  left_join(minwage5) %>% 
  left_join(minwage6) %>% 
  # one row per state-year; original column names like "2007" carry the year
  pivot_longer(-state, names_to = "year", values_to = "minwage") %>% 
  # split range entries like "9.50 - 10.50" into lower/upper bounds
  separate(minwage, into = c("minwageL", "minwageU"), sep = "-", remove = F) %>% 
  mutate(year = as.numeric(gsub("[^0-9]+", "", year)),
         # strip $ signs / footnote text before converting to numeric
         minwageL = as.numeric(gsub("[^0-9.]+", "", minwageL, perl = TRUE)),
         # states without a range get upper bound = lower bound
         minwageU = ifelse(is.na(minwageU), minwageL, as.numeric(gsub("[^0-9.]+", "", minwageU, perl = TRUE))),
         # Manual patches: Nevada's 2022/2023 cells do not parse cleanly
         # (two-tier wage depending on health benefits), so hard-code them.
         minwageL = ifelse(state=="Nevada" & year==2022, 9.50, minwageL),
         minwageU = ifelse(state=="Nevada" & year==2022, 10.50, minwageU),
         minwageL = ifelse(state=="Nevada" & year==2023, 10.25, minwageL),
         minwageU = ifelse(state=="Nevada" & year==2023, 11.25, minwageU)) %>%
  # drop territories not present in the CPS state file
  filter(!(state %in% c("Guam", "Puerto Rico", "U.S. Virgin Islands")))

# assign federal min if min is missing or fed > state min 
# Pull the federal (FLSA) row into its own year-indexed series.
fedmin <- mintab %>% 
  filter(state=="Federal (FLSA)") %>% 
  mutate(fedmin = minwageL) %>% 
  select(year, fedmin)
# Join by year (implicit key: "year" is the only shared column), then apply
# the binding-minimum rule to both bounds and drop the federal row itself.
mintab <- mintab %>% 
  left_join(fedmin) %>% 
  mutate(minwageL = ifelse(is.na(minwageL) | fedmin>minwageL, fedmin, minwageL),
         minwageU = ifelse(is.na(minwageU) | fedmin>minwageU, fedmin, minwageU)) %>% 
  filter(state!="Federal (FLSA)") %>% 
  select(-minwage) 

Cleaning Data and Making Education a Readable Format

# Recode IPUMS-CPS sentinel values to NA: hourwage/hourwage2 use 999.99 and
# earnweek/earnweek2 use 9999.99 for NIU/missing.
# (The original recoded hourwage twice; the duplicate line was removed.)
org$hourwage <- ifelse(org$hourwage == 999.99, NA, org$hourwage)
org$hourwage2 <- ifelse(org$hourwage2 == 999.99, NA, org$hourwage2)
org$earnweek <- ifelse(org$earnweek == 9999.99, NA, org$earnweek)
org$earnweek2 <- ifelse(org$earnweek2 == 9999.99, NA, org$earnweek2)

# Translate the numeric IPUMS educ codes into readable labels.
# Fix: "kingergarten" -> "kindergarten" (label typo).
org <- org %>%
  mutate(educ_level = case_when(
    educ == 2 ~ "None, preschool, or kindergarten",
    educ == 10 ~ "Grades 1, 2, 3, or 4",
    educ == 20 ~ "Grades 5 or 6",
    educ == 30 ~ "Grades 7 or 8",
    educ == 40 ~ "Grade 9",
    educ == 50 ~ "Grade 10",
    educ == 60 ~ "Grade 11",
    educ == 71 ~ "12th grade, no diploma",
    educ == 73 ~ "High school diploma or equivalent",
    educ == 81 ~ "Some college but no degree",
    educ == 91 ~ "Associate's degree, occupational/vocational",
    educ == 92 ~ "Associate's degree, academic program",
    educ == 111 ~ "Bachelor's degree",
    educ == 123 ~ "Master's degree",
    educ == 124 ~ "Professional school degree",
    educ == 125 ~ "Doctorate degree",
    TRUE ~ as.character(educ)  # keeps any existing values not matched
  ))  

# Attach the state minimum-wage panel to the CPS microdata on (year, state).
org_minwage <- left_join(org, mintab, by = c("year", "state"))

org_minwage <- org_minwage %>%
  group_by(year, occ2010, statecd) %>%  # Group by year, occ2010, and statecd
  mutate(mean_hourwage = mean(hourwage, na.rm = TRUE)) %>%  # Calculate mean of hourwage
  ungroup()  # Ungroup to prevent grouped structure in the final dataframe

# occ2010 == 9999 is the NIU ("no occupation") code; recode it to NA.
# NOTE(review): this happens AFTER mean_hourwage is computed, so the 9999
# group still received its own group mean; rows are recoded, not dropped here.
org_minwage$occ2010 <- ifelse(org_minwage$occ2010 == 9999, NA, org_minwage$occ2010)

Mutating org_minwage to calculate the mean hourly wage for occupations depending on year and state codes, while also removing NIU values for occ2010.


2006 - 2010 Recession

#### 2006-2010 recession

# Occupation-year cells from 2004-2010 with enough observations (> 500)
# that year-over-year percentage changes are meaningful.
recession_period <- data %>%
  filter(year >= 2004, year <= 2010, count > 500)

# Within the recession window proper, keep only the steepest losers.
rec_period_loss <- recession_period %>%
  filter(year >= 2007, year <= 2010, percentage_change <= -11.48)

# Hand-picked occupations of interest over 2007-2010 (from the full data).
rec_loss <- data %>%
  filter(
    occ2010 %in% c(4010, 3800, 4920, 8220, 5810, 6230, 6420, 8740, 7340, 9100),
    year >= 2007, year <= 2010
  )

# Same occupations, restricted to the count-filtered panel.
rec_per_loss_y_over_y <- recession_period %>%
  filter(
    occ2010 %in% c(4010, 3800, 4920, 8220, 5810, 6230, 6420, 8740, 7340, 9100),
    year >= 2007, year <= 2010
  )

# Average percentage change per (year, occ2010, count) cell.
avg_rec_per_loss <- recession_period %>%
  group_by(year, occ2010, count) %>%
  summarize(mean_per_change = mean(percentage_change, na.rm = TRUE))

Here the data is manipulated for specific observations of occ2010 that saw the largest overall percent decrease throughout the 2007-2010 recession. The counts are filtered to above 500 to minimize small differences between years that cause large percent changes — for example, an occupation going from 3 observations to 2, which would be a -33.33% change.



Bar Charts for 2007-2010 Recession Period

# Figure 3: yearly percentage change for each occupation of interest, 2007-2010.
ggplot(rec_loss, aes(x = year, y = percentage_change, fill = factor(occ2010))) +
  geom_bar(stat = "identity", position = "dodge") +  # Bar chart with dodge positioning
  labs(
    title = "Percentage Change by occ2010 Over Years",
    x = "Years",
    y = "Percentage Change",
    fill = "occ2010"
  ) +
  theme_minimal() +  # Use a clean theme
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  )

# Figure 4: average percentage change per occupation over the whole window.
# (no na.rm here: an NA percentage_change would propagate to the mean)
avg_perc_2006 <- rec_loss %>%
  group_by(occ2010) %>%             
  summarize(avg_percentage_change = mean(percentage_change))

ggplot(avg_perc_2006, aes(x = factor(occ2010), y = avg_percentage_change, fill = factor(occ2010))) +
  geom_bar(stat = "identity", position = "dodge") +  # Bar chart with dodge positioning
  
  labs(
    title = "Average Percentage Change by occ2010 
    Over 2007-2010 Recession",
    x = "occ2010",
    y = "Average Percentage Change",
    fill = "occ2010"
  ) +
  theme_minimal() +  # Use a clean theme
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  )

# Lookup table mapping each 2007-2010 recession occ2010 code of interest
# to a human-readable occupation name.
# (stringsAsFactors = FALSE dropped: character columns are the default
# behavior since R 4.0, so the argument was redundant.)
occ_lookup_rec <- data.frame(
  occ2010 = c(3800, 4010, 4920, 5810, 6230, 6420, 7340, 8220, 8740, 9100),
  occupation_name = c(
    "Sheriffs, Bailiffs, Correctional Officers, and Jailers", 
    "First-Line Supervisors of Food Preparation and Serving Workers", 
    "Real Estate Brokers and Sales Agents", 
    "Data Entry Keyers", 
    "Carpenters", 
    "Painters, Construction and Maintenance", 
    "Maintenance and Repair Workers, General", 
    "Metal Workers and Plastic Workers", 
    "Inspectors, Testers, Sorters, Samplers, and Weighers", 
    "Bus and Ambulance Drivers and Attendants"
  )
)

# Figure 5: render the recession occupation lookup as a styled HTML table.
occ_table_rec <- occ_lookup_rec %>%
  select(occ2010, occupation_name) %>%     # Select relevant columns
  arrange(occ2010)                          # Sort by occ2010 if desired

# Create a styled HTML table
occ_table_rec %>%
  kable("html", caption = "Occupation Codes and Names for 2007-2010 Recession") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Occupation Codes and Names for 2007-2010 Recession
occ2010 occupation_name
3800 Sheriffs, Bailiffs, Correctional Officers, and Jailers
4010 First-Line Supervisors of Food Preparation and Serving Workers
4920 Real Estate Brokers and Sales Agents
5810 Data Entry Keyers
6230 Carpenters
6420 Painters, Construction and Maintenance
7340 Maintenance and Repair Workers, General
8220 Metal Workers and Plastic Workers
8740 Inspectors, Testers, Sorters, Samplers, and Weighers
9100 Bus and Ambulance Drivers and Attendants

The first bar chart, titled “Percentage Change by occ2010 Over Years” displays the percentage change for each occupation of interest throughout the years of interest. The largest value decreases are seen in 2008 and 2009. This graph is labeled as “Figure 3” in the Google Doc. The second bar chart, titled “Average Percentage Change by occ2010 Over 2007-2010 Recession” displays the average percentage change for the occupations of interest throughout the years of interest. The occupations which saw the largest decreases were 4920, 5810, and 8220. This graph is labeled as “Figure 4” in the Google Doc. The table titled, “Occupation Codes and Names for 2007-2010 Recession” displays the occupations of interest and lines up the codes with the occupation names. This is labeled as “Figure 5” in the Google Doc.

Real Estate Brokers

# Figure 6: real estate brokers (occ2010 == 4920) plotted alone.
# NOTE(review): rec_loss is already restricted to 2007-2010 upstream, so the
# year >= 2005 lower bound adds nothing here.
real_estate <- rec_loss %>%
  filter(occ2010 %in% c(4920)) %>%
  filter(year >= 2005 & year <= 2010)

ggplot(real_estate, aes(x = year, y = percentage_change, color = factor(occ2010), group = occ2010)) +
  # NOTE(review): `size` for lines is deprecated in ggplot2 >= 3.4 (use
  # `linewidth`); left unchanged to avoid a version dependency.
  geom_line(size = 0.5) +  # Add lines for each occ2010 group
  
  labs(
    title = "Percentage Change by occ2010 Over Years",
    x = "Years",
    y = "Percentage Change",
    color = "occ2010"
  ) +
  theme_minimal() + # Use a clean theme
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  )

I decided to plot real estate brokers on their own from 2007-2010, as I noticed that in the 2007-2010 recession period this was an occupation found to have a high percentage loss. The graph demonstrates the decrease in real estate brokers due to the housing bubble bursting. This is shown as Figure 6 in the Google Doc.


Setting Up Data for Plots

### Scatter Plot to Show the Minimum Wage Affecting Occupation Losses

# Flag workers earning within 150% of the state's (upper-bound) minimum wage.
# within_150pct is NA when hourwage or minwageU is NA; those rows are dropped below.
# NOTE(review): group_by has no effect on this row-wise mutate, but it does
# leave the result grouped; downstream code regroups anyway.
org_minwage_2007 <- org_minwage%>%
  group_by(year, occ2010, statecd) %>%
  filter(occ2010 %in% c(4010, 3800, 4920, 8220, 5810, 6230, 6420, 8740, 7340, 9100)) %>%
  filter(year >= 2007 & year <= 2010) %>%
  mutate(
    within_150pct = ifelse(hourwage <= 1.5 * minwageU, 1, 0)  # Create binary variable
  ) %>%
  # drop columns not needed for the plot to keep the frame small
  select(-cpsid,-month,-pernum,-wtfinl,-cpsidp,-earnweek2,-hourwage2,-age,-sex,-race,-nativity,-hispan,-empstat,-ind1990,-educ,-schlcoll,-earnweek,-earnwt,-educ_level) %>%
  filter(!is.na(within_150pct)) %>%
  filter(!is.na(hourwage))
  

# Share of each occupation's workers near the minimum wage (mean of 0/1 flag).
org_minwage_2007_grouped <- org_minwage_2007 %>%
  group_by(occ2010) %>%  # Group by occ2010
  summarise(mean_within_150pct = mean(within_150pct, na.rm = TRUE), .groups = "drop")  # Calculate the mean

# Pair each occupation's minimum-wage exposure with its worst percentage loss.
org_minwage_2007_plot <- org_minwage_2007_grouped %>%
  left_join(rec_period_loss, by = "occ2010")

Throughout this chunk of code I am looking for ways to manipulate the data into a way that will be portrayed best graphically. I take the hourwage and determine that if it is within 150% of the upper bound of the minimum wage the observation is assigned a 1. I then group the full dataset by occ2010 to get the average of the binary variable, for example: if occ2010, 4920 has 4000 observations, and 3000 of these observations are within the binary range. Then the math is 3000/4000 = the binary percentage which in this case is 0.75.


####Scatter Plots for Occupations of Interest in 2007-2010

# Figure 7: interactive scatter of minimum-wage exposure vs. percentage change.
# The `text` aesthetic is not native ggplot2 (it warns) but is what plotly
# uses for hover tooltips after ggplotly().
ggplot_base <- ggplot(org_minwage_2007_plot, aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))) +
  geom_point(aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)), size = 3) +  # Adds hover text
  labs(
    title = "Scatter Plot of Percentage Change vs. Mean Within 150%\n(2007-2010 Period)",
    x = "Mean Within 150%",
    y = "Percentage Change",
    color = "Occupation (occ2010)"
  ) +
  scale_y_reverse() +
  theme_minimal()

# Convert to interactive plotly plot
interactive_plot <- ggplotly(ggplot_base, tooltip = "text")  # Tooltip displays the text defined in aes
# Display the plot
interactive_plot
# Table 1: OLS of percentage change on minimum-wage exposure (10 occupations).
fit_specific_rec <- lm(percentage_change ~ mean_within_150pct, data = org_minwage_2007_plot)
slope_specific_rec <- coef(fit_specific_rec)[2]
Specific_Rec_Reg <- list("Linear Model"=fit_specific_rec)
# NOTE(review): "percentage_change" in coef_map is the response, not a
# coefficient, so only the mean_within_150pct row can render — confirm intended.
modelsummary(Specific_Rec_Reg,
             coef_map = c("percentage_change"="Greatest Negative Percentage Change", "mean_within_150pct"="Within 150% of Minimum Wage"),
             stars=T, gof_map = c("nobs", "r.squared"),
             title = "Percentage Change of Employment based on being within 150% of Minimum Wage during the 2007-2010 Recession for only Occupations of Interest",
             output="default")
Percentage Change of Employment based on being within 150% of Minimum Wage during the 2007-2010 Recession for only Occupations of Interest
Linear Model
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
Within 150% of Minimum Wage 2.728
(5.068)
Num.Obs. 10
R2 0.035
# Figure 8: same scatter as Figure 7 with the fitted regression line overlaid
# (slope shown in the hover tooltip) and a dashed zero-change reference line.
ggplot_base <- ggplot(org_minwage_2007_plot, aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))) +
  geom_point(aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)), size = 3) +
  geom_smooth(aes(text = paste("Slope:", round(slope_specific_rec, 4))),
              method = "lm", se = FALSE, color = "black", linetype = "solid") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Scatter Plot of Percentage Change vs. Mean Within 150% (2007-2010) for occ2010 of Interest With Regression Line",
       x = "Mean Within 150%",
       y = "Percentage Change") +
  scale_y_reverse() +
  guides(color = "none") +
  theme_minimal()

interactive_plot <- ggplotly(ggplot_base, tooltip = "text")
interactive_plot

The two graphs share the exact same data, however the reason for them being plotted twice is because the first graph shows the spread of the percentage change in a more visually appealing way. While the graph with the regression line shrinks the distance on the y-axis showing the points very close to one another. The regression in the modelsummary table provides information that the closer the occupation is to the minimum wage for these occupations of interest, the more positive the percentage change. These are Figures 7 and 8 in the Google Doc. The table associated is labeled as Table 1 in the Google Doc.


####Data Manipulation for Total Occupations During 2007-2010 Recession Period

# Same 150%-of-minimum-wage flag as before, but for ALL occupations, 2007-2010.
all_occ_2007 <- org_minwage %>%
  group_by(year, occ2010, statecd) %>%
  filter(year >= 2007 & year <= 2010) %>%
  mutate(
    within_150pct = ifelse(hourwage <= 1.5 * minwageU, 1, 0)  # Create binary variable
  ) %>%
  # drop columns not needed downstream
  select(-cpsid,-month,-pernum,-wtfinl,-cpsidp,-earnweek2,-hourwage2,-age,-sex,-race,-nativity,-hispan,-empstat,-ind1990,-educ,-schlcoll,-earnweek,-earnwt,-educ_level) %>%
  filter(!is.na(within_150pct)) %>%
  filter(!is.na(hourwage))

# Grouped copy for later use.
all_occ_2007_a <- all_occ_2007 %>%
  group_by(occ2010, year)  # Group by occ2010 and year

# Recode the occ2010 NIU sentinel to NA. The NIU code is 9999 (matching the
# identical recodes applied elsewhere in this script); the original compared
# against 9999.99, which never matches an occ2010 code, so NIU rows slipped through.
all_occ_2007_a$occ2010 <- ifelse(all_occ_2007_a$occ2010 == 9999, NA, all_occ_2007_a$occ2010)


# Share of each occupation's workers near the minimum wage (mean of 0/1 flag).
all_occ_2007_binary <- all_occ_2007 %>%
  group_by(occ2010) %>%
  summarise(mean_within_150pct = mean(within_150pct, na.rm = TRUE), .groups = "drop")  # Calculate the mean

# Worst year-over-year percentage change per occupation in 2007-2010.
# NOTE(review): this uses count >= 500 while the earlier recession filter used
# count > 500 — confirm the boundary is intentional.
total_per_change_rec <- data %>%
  filter(year >= 2007 & year <= 2010) %>%  # Filter data by year range first
  filter(count >= 500) %>%
  group_by(occ2010) %>%             # Group by occ2010
  filter(percentage_change == min(percentage_change, na.rm = TRUE)) %>%  # Keep rows with the smallest percentage_change
  ungroup()

# One row per occupation: minimum-wage exposure + its worst percentage change.
all_occ_2007_b <- all_occ_2007_binary %>%
  left_join(total_per_change_rec %>% select(occ2010, percentage_change), 
            by = c("occ2010")) %>%
  filter(!is.na(percentage_change))

The data is manipulated to create a mean for the binary variable of all occ2010 values that have counts over 500 for the specific years of interest. Then I filter for the lowest percent change for the occ2010 values for the years of interest. Then I join the data all together to have the percentage change and mean of the binary variable to give a percentage of the occ2010 near the minimum wage threshold.


# Table 2: OLS of worst percentage change on minimum-wage exposure, all occupations.
fit_rec <- lm(percentage_change ~ mean_within_150pct, data = all_occ_2007_b)
slope_rec <- coef(fit_rec)[2]
Rec_Reg <- list("Linear Model"=fit_rec)
# NOTE(review): "percentage_change" in coef_map is the response, not a
# coefficient, so only the mean_within_150pct row can render — confirm intended.
modelsummary(Rec_Reg,
             coef_map = c("percentage_change"="Greatest Negative Percentage Change", "mean_within_150pct"="Within 150% of Minimum Wage"),
             stars=T, gof_map = c("nobs", "r.squared"),
             title = "Percentage Change of Employment based on being within 150% of Minimum Wage during the 2007-2010 Recession",
             output="default")
Percentage Change of Employment based on being within 150% of Minimum Wage during the 2007-2010 Recession
Linear Model
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
Within 150% of Minimum Wage 3.077
(2.305)
Num.Obs. 112
R2 0.016
# Figure 9: all-occupation scatter with fitted regression line and zero line.
ggplot_base <- ggplot(all_occ_2007_b, aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))) +
  geom_point(aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)), size = 3) +
  geom_smooth(aes(text = paste("Slope:", round(slope_rec, 4))),
              method = "lm", se = FALSE, color = "black", linetype = "solid") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Scatter Plot of Percentage Change vs. Mean Within 150% (2007-2010)",
       x = "Mean Within 150%",
       y = "Percentage Change") +
  scale_y_reverse() +
  guides(color = "none") +
  theme_minimal()

interactive_plot <- ggplotly(ggplot_base, tooltip = "text")
interactive_plot

The regression table shown portraying the linear model shows a non statistically significant value of 3.077, and is shown as “Table 2” in the Google doc. Between all the occupations of counts over 500, we can see a large proportion reside on the left side of the 0.50 average of the binary, which causes a positive correlation with percentage change and the mean of the binary. There is a positive correlation with a slope of 3.0771. Meaning that the closer the occ2010 is to minimum wage, the smaller the percentage decrease the occupation would have seen. This is “Figure 9” in the Google Doc.

Setting Up Data for State Graphs and Tables for the 2007-2010 Recessionary Period

#### Taking occupation counts to create percentages of labor force for each year during 2007-2010

# Count observations per occupation per year (2007-2010), one column per year.
# (The original ran this identical pipeline twice back-to-back; the duplicate
# was removed — the final state of recession_counts is unchanged.)
recession_counts <- org %>%
  filter(year >= 2007 & year <= 2010) %>%  # Filter for years 2007-2010
  group_by(occ2010, year) %>%  # Group by occ2010 and year
  summarise(count = n(), .groups = "drop") %>%  # Count observations
  pivot_wider(names_from = year, values_from = count, values_fill = 0)  # Pivot to wide format

# occ2010 == 9999 is the NIU ("no occupation") code; recode to NA ...
recession_counts$occ2010 <- ifelse(recession_counts$occ2010 == 9999, NA, recession_counts$occ2010)

# ... then drop those rows entirely
recession_counts <- recession_counts %>%
  filter(!is.na(occ2010))


# Labor-force totals per year (single-row summary, so each `total_recession$...`
# below is a scalar and the division is well-defined).
# NOTE(review): passing na.rm through across(...)'s `...` is deprecated in
# dplyr >= 1.1; the modern form is across(starts_with("20"), \(x) sum(x, na.rm = TRUE)).
total_recession <- recession_counts %>%
  summarise(across(starts_with("20"), sum, na.rm = TRUE))

# Each occupation's share of total observations, per year.
recession_counts <- recession_counts %>%
  mutate(
    percent_2007 = (`2007` / total_recession$`2007`) * 100,
    percent_2008 = (`2008` / total_recession$`2008`) * 100,
    percent_2009 = (`2009` / total_recession$`2009`) * 100,
    percent_2010 = (`2010` / total_recession$`2010`) * 100
  )

recession_percent_filt <- recession_counts %>%
  filter(occ2010 %in% c(4010, 3800, 4920, 8220, 5810, 6230, 6420, 8740, 7340, 9100)) # Filter for variables of interest

# Take the state level during recession


# Counts per (occupation, state) with one column per year, 2007-2010.
state_rec_counts <- org_minwage %>%
  filter(year >= 2007 & year <= 2010) %>%  # Filter for years 2007-2010
  group_by(occ2010, year, state) %>%  # Group by occ2010, year, and state
  summarise(count = n(), .groups = "drop") %>%  # Count observations
  pivot_wider(names_from = year, values_from = count, values_fill = 0)  # Pivot to wide format
  

# Filter out the N/A from occ2010 == 9999 (recoded to NA upstream)
state_rec_counts <- state_rec_counts %>%
  filter(!is.na(occ2010))


# Total observations per state per year, across all occupations.
state_total_rec <- state_rec_counts %>%
  group_by(state) %>%  # Group by state
  summarise(across(starts_with("20"), sum, na.rm = TRUE), .groups = "drop")

# Kept for backward compatibility: state_total_rec is already one row per
# state, so this re-aggregation is a no-op copy.
state_rec <- state_total_rec %>%
  group_by(state) %>%
  summarise(across(starts_with("20"), sum, na.rm = TRUE))

# Each occupation's share of its own state's observations, per year.
# BUG FIX: the original divided directly by state_rec$`2007` etc., recycling
# the length-(#states) totals vector against the (occ2010 x state) rows
# without aligning on state — wrong denominators for almost every row.
# Join the totals on state so each row is divided by its state's total.
state_rec_perc <- state_rec_counts %>%
  left_join(
    state_rec %>%
      rename(total_2007 = `2007`, total_2008 = `2008`,
             total_2009 = `2009`, total_2010 = `2010`),
    by = "state"
  ) %>%
  mutate(
    percent_2007 = (`2007` / total_2007) * 100,
    percent_2008 = (`2008` / total_2008) * 100,
    percent_2009 = (`2009` / total_2009) * 100,
    percent_2010 = (`2010` / total_2010) * 100
  ) %>%
  select(-total_2007, -total_2008, -total_2009, -total_2010)

# Keep only the occupations of interest for the state-level analysis.
state_rec_per_filt <- state_rec_perc %>%
  filter(occ2010 %in% c(4010, 3800, 4920, 8220, 5810, 6230, 6420, 8740, 7340, 9100)) 

# Reshape to long: one row per (occupation, state, year) with the raw count
# and the within-state employment percentage as separate columns.
long_rec <- state_rec_per_filt %>%
  pivot_longer(
    cols = c("2007", "2008", "2009", "2010", "percent_2007", "percent_2008", "percent_2009", "percent_2010"), 
    names_to = "key", 
    values_to = "value"
  ) %>%
  mutate(
    # recover the year from either "2007" or "percent_2007" style keys
    year = case_when(
      key %in% c("2007", "2008", "2009", "2010") ~ key,
      key %in% c("percent_2007", "percent_2008", "percent_2009", "percent_2010") ~ sub("percent_", "", key)
    ),
    # tag whether the value is a raw count or a percentage
    variable = case_when(
      key %in% c("2007", "2008", "2009", "2010") ~ "counts",
      key %in% c("percent_2007", "percent_2008", "percent_2009", "percent_2010") ~ "percent_employment"
    )
  ) %>%
  select(-key) %>%
  pivot_wider(names_from = variable, values_from = value)


long_rec <- long_rec %>%
  mutate(year = as.numeric(year))

# Ensure year is numeric on both sides before joining minimum wages.
mintab <- mintab %>%
  mutate(year = as.numeric(year))

mintab_subset <- mintab %>% select(state, year, minwageU)

# Perform a left join to merge the selected column into long_rec
long_rec <- long_rec %>%
  left_join(mintab_subset, by = c("year", "state"))

org_minwage <- org_minwage %>%
  mutate(year = as.numeric(year))

# Bring in the group-mean hourly wage and the state FIPS code.
# distinct() collapses the row duplication this join creates (org_minwage has
# many person-rows per (state, year, occ2010) cell).
orgminwage_subset <- org_minwage %>% select(state, year, occ2010, mean_hourwage, statefip)

long_rec <- long_rec %>%
  left_join(orgminwage_subset, by = c("year", "state", "occ2010")) %>%
  distinct()

# Flag (state, year, occupation) cells whose mean wage is within 150% of the
# state's upper-bound minimum wage. NA when mean_hourwage or minwageU is NA.
long_rec <- long_rec %>%
  mutate(
    within_150pct = ifelse(mean_hourwage <= 1.5 * minwageU, 1, 0)  # Create binary variable
  )

# How many states per occupation-year sit within the 150% threshold.
binary_rec <- long_rec %>%
  filter(within_150pct == 1) %>%  # Filter rows where the binary variable is 1
  group_by(occ2010, year) %>%  # Group by occ2010 and year
  summarise(count = n(), .groups = "drop")  # Count occurrences for each occ2010


# Collapse across years to one total per occupation.
binary_total_rec <- binary_rec %>%
  group_by(occ2010) %>%  # Group by occ2010
  summarise(total_count = sum(count), .groups = "drop")


# Attach the year-over-year percentage changes for the occupations of interest.
rec_per_loss_y_over_y_subset <- rec_per_loss_y_over_y %>% select(year, occ2010, percentage_change)

long_rec <- long_rec %>%
  left_join(rec_per_loss_y_over_y_subset, by = c("year", "occ2010"))

# Drop rows that never matched the microdata (no statefip).
long_rec <- long_rec %>%
  filter(!is.na(statefip))


# Weight each state's employment share by the occupation's minimum-wage
# exposure, then export for the Python percentage-change step.
states_strongest_rec <- long_rec %>%
  select(occ2010, year, percent_employment, state) %>% 
  left_join(all_occ_2007_b, by = c("occ2010")) %>%
  mutate(employment_within_150 = percent_employment * mean_within_150pct) %>%
  select(-percentage_change)

write_xlsx(states_strongest_rec, "states_strongest_rec.xlsx")

Throughout this chunk of code, loads of data manipulation is taking place. To give a shorthand explanation, the goal is to create state variables of job loss year over year during the period of interest, 2007-2010. By creating percentages I can also view which occupations were proportionately the largest by each state and year. This will help me set up data to create a table to demonstrate which states were proportionately hit the hardest in certain occ2010 values due to the recessionary periods.


Top 10 Most Affected Percent Changes for 2007-2010

# Table 3: read the Python-computed employment differences back in and show
# the ten most negative values.
states_per_change_2007 <- read_excel("employment_diff_results_rec.xlsx")

lowest_diff_2007 <- states_per_change_2007 %>%
  arrange(employment_diff) %>%    # Sort by employment_diff ascending
  head(10)                        # Take the top 10 rows

# Create a styled table
lowest_diff_2007 %>%
  kable("html", caption = "Top 10 Lowest Employment Diff Values") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Top 10 Lowest Employment Diff Values
occ2010 year percent_employment state mean_within_150pct employment_within_150 employment_diff
4920 2009 1.0167464 Utah 0.3780220 0.3843525 -0.3251316
6230 2009 3.1952663 Florida 0.1165025 0.3722566 -0.2735587
4920 2009 1.3579049 Maryland 0.3780220 0.5133179 -0.2663049
4920 2010 0.3558719 Utah 0.3780220 0.1345274 -0.2498251
4920 2008 1.0224274 Nevada 0.3780220 0.3865000 -0.2225721
4920 2009 2.2380291 California 0.3780220 0.8460242 -0.2219835
4920 2010 2.5787966 Florida 0.3780220 0.9748418 -0.2129728
5810 2009 1.2874155 California 0.2327531 0.2996499 -0.2060117
4010 2009 0.6300630 Nevada 0.5446339 0.3431537 -0.2002988
4010 2009 0.5684627 Ohio 0.5446339 0.3096040 -0.1978614

Here I create a table with the mutated data from the block above this one. The percent changes are calculated in Python for ease of calculations and then are read right back into R. From the table produced, we can see that there is a trend in which the occ2010 code 4920 takes 6/10 spots on the table. The code 4920 is associated with housing brokers, and due to the financial crisis of the housing market bubble popping this makes sense. Many housing brokers during this period lost their jobs due to the market failure. Utah is on this table twice for the same occ2010 codes for back to back years in 2009 and 2010, displaying that the housing market took a large hit for multiple years in Utah.

This is labeled as “Table 3,” in the Google Doc Write up.


####State Graphs for the 2007-2010 Recession Period

# Lookup table: the 51 state FIPS codes (50 states + DC) to full state names,
# in matching positional order.
state_mapping <- data.frame(
  statefip = c(1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 
               20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 
               36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 
               53, 54, 55, 56),
  state_name = c(
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", 
    "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia", 
    "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", 
    "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", 
    "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", 
    "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", 
    "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", 
    "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", 
    "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
  )
)

# Distinct FIPS codes present in long_rec, coerced to character.
# NOTE(review): statefip_rec is never used again — candidate for removal.
statefip_rec <- long_rec %>%
  select(statefip) %>%
  distinct() %>%
  mutate(statefip = as.character(statefip))
  


# Attach full state names by FIPS code.
# NOTE(review): the mutate() only adds a statefip_rec column that is never
# used; the join itself matches on statefip — confirm long_rec$statefip and
# state_mapping$statefip share a type (numeric vs character), otherwise the
# join errors.
long_rec <- long_rec %>%
  left_join(state_mapping %>% mutate(statefip_rec = as.character(statefip)), by = "statefip")


# Save one PNG bar chart per state showing percent employment by occupation
# and year (Figure 10 uses the Utah output).
#
# Args:
#   long_rec   - data frame with statefip, state_name, year,
#                percent_employment, and occ2010 columns
#   output_dir - folder (under the working directory) to write PNGs into
# Side effects: creates output_dir if missing; writes state_<fips>.png files.
create_state_plots1 <- function(long_rec, output_dir = "state_plots_rec") {
  # Create output directory if it doesn't exist
  full_output_dir <- file.path(getwd(), output_dir)
  
  if (!dir.exists(full_output_dir)) {
    dir.create(full_output_dir)
  }
  
  # Ensure statefip is a character so comparisons below are consistent
  long_rec <- long_rec %>% mutate(statefip = as.character(statefip))
  
  # Get unique statefip codes
  statefips <- unique(long_rec$statefip)
  
  # Iterate over each statefip and create a plot
  for (statefip_code in statefips) {
    # Filter data for the specific statefip
    state_data <- long_rec %>% filter(statefip == statefip_code)
    
    # Get the state name corresponding to the statefip
    # NOTE(review): if the name join upstream failed, this is NA and the plot
    # title reads "Percent Employment in State NA".
    state_name <- unique(state_data$state_name)
    
    # Skip empty subsets (shouldn't occur since codes come from the data)
    if (nrow(state_data) > 0) {
      
      # Generate the ggplot for each state
      p <- ggplot(state_data, aes(x = year, y = percent_employment, fill = as.factor(occ2010))) +
        geom_bar(stat = "identity", position = "dodge") +
        labs(
          title = paste("Percent Employment in State", state_name),
          x = "Year",
          y = "Percent Employment",
          fill = "Occupation"
        ) +
        theme_minimal() +
        theme(
          axis.text.x = element_text(angle = 45, hjust = 1),
          strip.text = element_text(size = 8)
        )
      
      # Save the plot to the output directory with statefip-specific file names
      ggsave(filename = file.path(full_output_dir, paste0("state_", statefip_code, ".png")), plot = p, width = 10, height = 6)
    } else {
      warning(paste("No data found for statefip:", statefip_code))
    }
  }
}


# Generate all per-state PNGs into ./state_plots_rec
create_state_plots1(long_rec, output_dir = "state_plots_rec")

This code creates individual plots for each state in the dataset by their statefip codes. The graphs depict the percent employment of the occupations of interest that were chosen earlier on for the 2007-2010 recession period. These graphs provide a visual representation for anyone who is curious to see which states had the highest percent employment for these occupations (spoiler: one of them is California). The way these ratios are calculated comes from earlier code which takes the total count of occ2010 values by state and year and divides the occ2010 variables of interest by the total count to give a percentage. The function iterates over each statefip until reaching the final one at statefip = 56 (Wyoming) and then outputs the graphs into a new folder in the working directory. (To view these graphs the code will have to be run on your own.)

The Utah graph was used in the Google Doc write up and was labeled “Figure 10.”


Data Manipulation for the COVID-19 Recession

#### COVID-19 recession (2019-2022)

# Occupation-year cells with enough observations (> 500) during the COVID window.
covid_period <- data %>%
  filter(year >= 2019, year <= 2022, count > 500)

# Steepest percentage losses within the window.
covid_per_loss <- covid_period %>%
  filter(year >= 2019, year <= 2022, percentage_change <= -13.83)

# Hand-picked occupations of interest over 2019-2022 (from the full data).
covid_loss <- data %>%
  filter(
    occ2010 %in% c(4030, 2200, 3500, 5940, 7200, 4720, 3650, 4110, 4010, 4600),
    year >= 2019, year <= 2022
  )
# larger candidate set kept for reference:
# filter(occ2010 %in% c(800, 2200, 3500, 3650, 3930, 4010, 4030, 4110, 4230, 4600, 4720, 5940, 6420, 6050, 5200, 7200, 7700, 8965, 9600, 4250))

# Same occupations, restricted to the count-filtered panel.
covid_per_loss_y_over_y <- covid_period %>%
  filter(
    occ2010 %in% c(4030, 2200, 3500, 5940, 7200, 4720, 3650, 4110, 4010, 4600),
    year >= 2019, year <= 2022
  )

Here the data is manipulated for the COVID-19 period, 2019-2022. The data is filtered the same way by taking occ2010 with counts over 500 and then hand picking the top 10 highest percentage_changes on the negative side.


Bar Charts for COVID-19 Pandemic

# Yearly percentage change for each COVID-era occupation of interest.
ggplot(covid_loss, aes(x = factor(year), y = percentage_change, fill = factor(occ2010))) +
  geom_bar(stat = "identity", position = "dodge") +  # Bar chart with dodge positioning
  labs(
    title = "Percentage Change by occ2010 Over Years",
    x = "Years",
    y = "Percentage Change",
    fill = "occ2010"
  ) +
  theme_minimal() +  # Use a clean theme
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  )

# Average percentage change per occupation across the 2019-2022 window
# (one row per occ2010 code).
avg_perc_covid <- covid_loss %>%
  group_by(occ2010) %>%
  summarize(avg_percentage_change = mean(percentage_change))

# One bar per occupation: its mean percentage change over the COVID-19
# period, colored by occupation code.
ggplot(avg_perc_covid, aes(x = factor(occ2010), y = avg_percentage_change, fill = factor(occ2010))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average Percentage Change by occ2010\nOver COVID-19 Period (2019-2022)",
       x = "occ2010",
       y = "Average Percentage Change",
       fill = "occ2010") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 16),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14))

# Lookup table mapping the ten COVID-period occupation codes to readable
# names (used only for display in the styled table below).
occ_lookup_covid <- data.frame(
  occ2010 = c(2200, 3500, 3650, 4010, 4030, 4110, 4600, 4720, 5940, 7200),
  occupation_name = c(
    "Postsecondary Teachers",
    "Licensed Practical and Licensed Vocational Nurses",
    "Medical Assistants and Other Healthcare Support Occupations",
    "First-Line Supervisors of Food Preparation and Serving Workers",
    "Food Preparation Workers",
    "Waiters and Waitresses",
    "Childcare Workers",
    "Cashiers",
    "Office and administrative support workers (Farming, Fishing, and Forestry)",
    "Automotive Service Technicians and Mechanics"
  )
)

# Order by code for presentation.
occ_table_covid <- occ_lookup_covid %>%
  select(occ2010, occupation_name) %>%
  arrange(occ2010)

# Render as a styled HTML table.
occ_table_covid %>%
  kable("html", caption = "Occupation Codes and Names for 2019-2022 Recession") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Occupation Codes and Names for 2019-2022 Recession
occ2010 occupation_name
2200 Postsecondary Teachers
3500 Licensed Practical and Licensed Vocational Nurses
3650 Medical Assistants and Other Healthcare Support Occupations
4010 First-Line Supervisors of Food Preparation and Serving Workers
4030 Food Preparation Workers
4110 Waiters and Waitresses
4600 Childcare Workers
4720 Cashiers
5940 Office and administrative support workers (Farming, Fishing, and Forestry)
7200 Automotive Service Technicians and Mechanics

Here the same pattern of bar charts is created as in the earlier recessionary period. The first graph displays the year-over-year changes for the occupations of interest, while the second graph shows the average percentage change for each occ2010 across the four years.

The graph labeled, “Percentage Change by occ2010 Over Years” takes the percentage change of the occ2010 codes of interest and shows the percentage changes year over year for the years of interest. This is labeled as “Figure 13” in the Google Doc.

The graph labeled, “Average Percentage Change by occ2010 Over COVID-19 Period (2019-2022)” takes the average percentage change of the occ2010 codes of interest and shows the percentage changes over the years of interest. This is labeled as “Figure 14” in the Google Doc.

The table labeled, “Occupation Codes and Names for 2019-2022 Recession” provides the occupation codes of interest during the COVID-19 crisis, and is labeled as “Figure 15” in the Google Doc.


Data Manipulation for COVID-19 Scatter Plots

### Scatter Plot for Covid to Display Minimum Wage Along Percentage Decrease of occ2010

# Restrict org_minwage to the ten occupations of interest over 2019-2022,
# flag workers whose hourly wage is within 150% of the state's (upper)
# minimum wage, and drop columns not needed for the scatter plots.
org_minwage_2019 <- org_minwage %>%
  group_by(year, occ2010, statecd) %>%
  filter(
    occ2010 %in% c(4030, 2200, 3500, 5940, 7200, 4720, 3650, 4110, 4010, 4600),
    year >= 2019 & year <= 2022
  ) %>%
  mutate(within_150pct = ifelse(hourwage <= 1.5 * minwageU, 1, 0)) %>%
  select(-cpsid, -month, -pernum, -wtfinl, -cpsidp, -earnweek2, -hourwage2,
         -age, -sex, -race, -nativity, -hispan, -empstat, -ind1990, -educ,
         -schlcoll, -earnweek, -earnwt, -educ_level) %>%
  filter(!is.na(within_150pct), !is.na(hourwage))

# Share of each occupation's workers within the 150% threshold.
org_minwage_2019_grouped <- org_minwage_2019 %>%
  group_by(occ2010) %>%
  summarise(mean_within_150pct = mean(within_150pct, na.rm = TRUE), .groups = "drop")

# Attach each occupation's worst percentage change for plotting.
org_minwage_2019_plot <- org_minwage_2019_grouped %>%
  left_join(covid_per_loss, by = "occ2010")

The dataframes are joined with one another following the same rules as the recessionary periods.


# Scatter: share of workers near the minimum wage (x) against the worst
# percentage change (y), one color per occupation. The "text" aesthetic is
# only used for the plotly hover tooltip.
ggplot_base <- ggplot(
  org_minwage_2019_plot,
  aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))
) +
  geom_point(
    aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)),
    size = 3
  ) +
  labs(
    title = "Scatter Plot of Percentage Change vs. Mean Within 150%\n(COVID-19 Period)",
    x = "Mean Within 150%",
    y = "Percentage Change",
    color = "Occupation (occ2010)"
  ) +
  scale_y_reverse() +
  theme_minimal()

# Convert to an interactive plotly figure; tooltip shows the text aesthetic.
interactive_plot <- ggplotly(ggplot_base, tooltip = "text")
interactive_plot
# Regression of worst percentage change on the within-150% share for the
# occupations of interest; the slope is reused to label the fitted line.
fit_specific_covid <- lm(percentage_change ~ mean_within_150pct,
                         data = org_minwage_2019_plot)
slope_specific_covid <- coef(fit_specific_covid)[2]
Specific_Covid_Reg <- list("Linear Model" = fit_specific_covid)
# NOTE(review): "percentage_change" in coef_map is the outcome variable, not
# a coefficient name, so that entry never matches; the intercept is dropped
# because it is absent from coef_map -- confirm this is intended.
modelsummary(
  Specific_Covid_Reg,
  coef_map = c("percentage_change" = "Greatest Negative Percentage Change",
               "mean_within_150pct" = "Within 150% of Minimum Wage"),
  stars = TRUE,
  gof_map = c("nobs", "r.squared"),
  title = "Percentage Change of Employment based on being within 150% of Minimum Wage during the 2019-2022 Recession for only Occupations of Interest",
  output = "default"
)
Percentage Change of Employment based on being within 150% of Minimum Wage during the 2019-2022 Recession for only Occupations of Interest
Linear Model
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
Within 150% of Minimum Wage 0.024
(3.787)
Num.Obs. 11
R2 0.000
# Same scatter as above, now with the fitted OLS line (slope shown on
# hover) and a dashed red reference line at zero percentage change.
ggplot_base <- ggplot(
  org_minwage_2019_plot,
  aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))
) +
  geom_point(
    aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)),
    size = 3
  ) +
  geom_smooth(aes(text = paste("Slope:", round(slope_specific_covid, 4))),
              method = "lm", se = FALSE, color = "black", linetype = "solid") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Scatter Plot of Percentage Change vs. Mean Within 150% (2019-2022) for occ2010 of Interest",
       x = "Mean Within 150%",
       y = "Percentage Change") +
  scale_y_reverse() +
  guides(color = "none") +
  theme_minimal()

interactive_plot <- ggplotly(ggplot_base, tooltip = "text")
interactive_plot

The graphs created here show the occupations of interest that were used within the bar chart graphs. The y-axis on the first graph is spread out to give a better visualization of the differences between the data. The second graph contains the regression line, which has a slope of 0.024, indicating a slight positive relationship between percentage change and the share of an occupation’s workers near the minimum wage. However, the coefficient is very small and not statistically significant.

The graph labeled, “Scatter Plot of Percentage Change vs. Mean Within 150% (COVID-19 Period)” displays the highest negative percentage change on the y-axis, and shows the percent of the occupation total within the minimum wage threshold of low wage on the x-axis, this is labeled as “Figure 16” on the Google Doc.

The graph labeled, “Scatter Plot of Percentage Change vs. Mean Within 150% (2019-2022) for occ2010 of Interest” displays the highest negative percentage change on the y-axis, and shows the percent of the occupation total within the minimum wage threshold of low wage on the x-axis, the regression line shows the best fit based on the graph. This is labeled as “Figure 17” on the Google Doc.

The table labeled, “Percentage Change of Employment based on being within 150% of Minimum Wage during the 2019-2022 Recession for only Occupations of Interest” displays the regression results which is used to fit the line for Figure 17. The slope is not statistically significant and has too low of an R^2 to be considered significant. This is labeled as “Table 4” in the Google Doc.


Data Manipulation for Scatter Plots for all occ2010 During COVID-19 Recession

# All occupations (not just the ten of interest), 2019-2022: flag wages
# within 150% of the state's upper minimum wage and trim unused columns.
all_occ_2019 <- org_minwage %>%
  group_by(year, occ2010, statecd) %>%
  filter(year >= 2019 & year <= 2022) %>%
  mutate(within_150pct = ifelse(hourwage <= 1.5 * minwageU, 1, 0)) %>%
  select(-cpsid, -month, -pernum, -wtfinl, -cpsidp, -earnweek2, -hourwage2,
         -age, -sex, -race, -nativity, -hispan, -empstat, -ind1990, -educ,
         -schlcoll, -earnweek, -earnwt, -educ_level) %>%
  filter(!is.na(within_150pct), !is.na(hourwage))

# Grouped copy by occupation and year (kept for parity with earlier code;
# not referenced again in this section).
all_occ_2019_a <- all_occ_2019 %>%
  group_by(occ2010, year)

# BUG FIX: the original compared occ2010 to 9999.99, which never matches the
# "no occupation" code (9999, as used elsewhere in this script), so the NA
# recode silently did nothing. Compare against 9999 instead.
all_occ_2019_a$occ2010 <- ifelse(all_occ_2019_a$occ2010 == 9999, NA, all_occ_2019_a$occ2010)


# Share of each occupation's workers earning within 150% of the minimum wage.
all_occ_2019_binary <- all_occ_2019 %>%
  group_by(occ2010) %>%
  summarise(mean_within_150pct = mean(within_150pct, na.rm = TRUE), .groups = "drop")

# For every occupation with at least 500 observations in 2019-2022, keep the
# row with that occupation's most negative percentage_change.
# NOTE(review): this uses count >= 500 while the earlier covid_period filter
# used count > 500 -- confirm the inconsistency is intentional.
total_per_change_rec <- data %>%
  filter(year >= 2019 & year <= 2022, count >= 500) %>%
  group_by(occ2010) %>%
  filter(percentage_change == min(percentage_change, na.rm = TRUE)) %>%
  ungroup()

# Join the worst percentage change onto the within-150% shares; occupations
# without a matching change are dropped.
all_occ_2019_b <- all_occ_2019_binary %>%
  left_join(select(total_per_change_rec, occ2010, percentage_change),
            by = "occ2010") %>%
  filter(!is.na(percentage_change))

The data is manipulated to encompass the COVID-19 recession and restricted to occupations with counts of at least 500. The binary variable is again created and averaged by occ2010 value to give the percentage of each occupation’s observations that fell within the 150% minimum wage threshold.


Graph of occ2010 Variables from COVID-19 Recession

# Regression over ALL qualifying occupations: worst percentage change on
# within-150% share; slope reused to label the fitted line below.
fit_covid <- lm(percentage_change ~ mean_within_150pct, data = all_occ_2019_b)
slope_covid <- coef(fit_covid)[2]
Covid_Reg <- list("Linear Model" = fit_covid)
# NOTE(review): the "percentage_change" coef_map entry names the outcome, not
# a coefficient, so it never matches; intercept is dropped by omission.
modelsummary(
  Covid_Reg,
  coef_map = c("percentage_change" = "Greatest Negative Percentage Change",
               "mean_within_150pct" = "Within 150% of Minimum Wage"),
  stars = TRUE,
  gof_map = c("nobs", "r.squared"),
  title = "Percentage Change of Employment based on being within 150% of Minimum Wage during the 2019-2022 Recession",
  output = "default"
)
Percentage Change of Employment based on being within 150% of Minimum Wage during the 2019-2022 Recession
Linear Model
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
Within 150% of Minimum Wage -8.753*
(3.334)
Num.Obs. 93
R2 0.070
# Scatter for all qualifying occupations with the fitted OLS line and a
# dashed zero-change reference; hover text comes from the "text" aesthetic.
ggplot_base <- ggplot(
  all_occ_2019_b,
  aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))
) +
  geom_point(
    aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)),
    size = 3
  ) +
  geom_smooth(aes(text = paste("Slope:", round(slope_covid, 4))),
              method = "lm", se = FALSE, color = "black", linetype = "solid") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Scatter Plot of Percentage Change vs. Mean Within 150% (2019-2022)",
       x = "Mean Within 150%",
       y = "Percentage Change") +
  scale_y_reverse() +
  guides(color = "none") +
  theme_minimal()

interactive_plot <- ggplotly(ggplot_base, tooltip = "text")
interactive_plot

The graph is displayed for the total amount of occ2010 that had a count over 500 during this time frame of 2019-2022. The COVID-19 recession has a slope of -8.7534 and is statistically significant which is shown in the linear model regression. This means that as the occ2010 gets closer to a 100% value for being near the minimum wage, these jobs experienced a higher percentage change loss than jobs that were not near the minimum wage threshold. This is labeled as “Figure 18” in the Google Doc.

The table labeled, “Percentage Change of Employment based on being within 150% of Minimum Wage during the 2019-2022 Recession” displays the results for the regression data that is plotted in Figure 18. The line is statistically significant with the highest R^2 value displayed throughout all the tables. This table is labeled as “Table 5.”



Data Manipulation for COVID-19 Recession Period State Graphs and Table

#### Taking occupation counts to create percentages of labor force for each year during 2019-2022


# National observation counts per occupation for 2019-2022 (one column per
# year); code 9999 = "no occupation" is recoded to NA and dropped.
# (Earlier comments said 2007-2010 -- the code has always used 2019-2022.)
national_covid_counts <- org %>%
  filter(year >= 2019 & year <= 2022) %>%
  group_by(occ2010, year) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = year, values_from = count, values_fill = 0) %>%
  mutate(occ2010 = na_if(occ2010, 9999)) %>%
  filter(!is.na(occ2010))


# Column totals across all occupations: denominators for the shares.
national_total_covid <- national_covid_counts %>%
  summarise(across(starts_with("20"), sum, na.rm = TRUE))

# Convert counts to each occupation's percent of all national observations.
national_covid_counts <- national_covid_counts %>%
  mutate(
    percent_2019 = (`2019` / national_total_covid$`2019`) * 100,
    percent_2020 = (`2020` / national_total_covid$`2020`) * 100,
    percent_2021 = (`2021` / national_total_covid$`2021`) * 100,
    percent_2022 = (`2022` / national_total_covid$`2022`) * 100
  )


# Keep the ten occupations of interest.
national_covid_percent_filt <- national_covid_counts %>%
  filter(occ2010 %in% c(4030, 2200, 3500, 5940, 7200, 4720, 3650, 4110, 4010, 4600))


# Take total by states, years for occupation to show state based labor participation during COVID-19

# Observation counts per occupation AND state for 2019-2022, one column per
# year (the comment previously said 2007-2010; the code uses 2019-2022).
state_covid_counts <- org_minwage %>%
  filter(year >= 2019 & year <= 2022) %>%
  group_by(occ2010, year, state) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = year, values_from = count, values_fill = 0)

state_covid_counts <- state_covid_counts %>%
  filter(!is.na(occ2010))

# Per-state totals across all occupations: denominators for the shares.
state_total_covid <- state_covid_counts %>%
  group_by(state) %>%
  summarise(across(starts_with("20"), sum, na.rm = TRUE), .groups = "drop")

# Kept for backward compatibility with later code: state_total_covid already
# has exactly one row per state, so re-summarising it is a no-op.
state_covid <- state_total_covid %>%
  group_by(state) %>%
  summarise(across(starts_with("20"), sum, na.rm = TRUE))

# BUG FIX: the original divided each count column by state_covid$`2019` etc.
# directly. state_covid_counts has many rows per state, so that vectorized
# division recycled the one-row-per-state totals against rows WITHOUT
# aligning them by state, producing wrong percentages. Join the totals by
# state so every row is divided by its own state's total.
state_covid_perc <- state_covid_counts %>%
  left_join(
    state_covid %>%
      rename(total_2019 = `2019`, total_2020 = `2020`,
             total_2021 = `2021`, total_2022 = `2022`),
    by = "state"
  ) %>%
  mutate(
    percent_2019 = (`2019` / total_2019) * 100,
    percent_2020 = (`2020` / total_2020) * 100,
    percent_2021 = (`2021` / total_2021) * 100,
    percent_2022 = (`2022` / total_2022) * 100
  ) %>%
  select(-starts_with("total_"))

# Keep the ten occupations of interest.
state_covid_per_filt <- state_covid_perc %>%
  filter(occ2010 %in% c(4030, 2200, 3500, 5940, 7200, 4720, 3650, 4110, 4010, 4600))


# Reshape to long form, then split the column key into a year and a variable
# type ("counts" for the raw year columns, "percent_employment" for the
# percent_* columns), and widen back so each row has both values.
long_covid <- state_covid_per_filt %>%
  pivot_longer(
    cols = c("2019", "2020", "2021", "2022",
             "percent_2019", "percent_2020", "percent_2021", "percent_2022"),
    names_to = "key",
    values_to = "value"
  ) %>%
  mutate(
    year = sub("^percent_", "", key),
    variable = ifelse(grepl("^percent_", key), "percent_employment", "counts")
  ) %>%
  select(-key) %>%
  pivot_wider(names_from = variable, values_from = value)


# Align year types on both sides before merging in the upper-bound minimum
# wage (minwageU) by state and year.
long_covid <- long_covid %>% mutate(year = as.numeric(year))

mintab <- mintab %>% mutate(year = as.numeric(year))

mintab_subset <- mintab %>% select(state, year, minwageU)

long_covid <- long_covid %>%
  left_join(mintab_subset, by = c("year", "state"))

# Merge mean hourly wage and statefip from org_minwage by state/year/
# occupation; distinct() guards against join-induced duplicate rows.
org_minwage <- org_minwage %>% mutate(year = as.numeric(year))

orgminwage_subset <- org_minwage %>%
  select(state, year, occ2010, mean_hourwage, statefip)

long_covid <- long_covid %>%
  left_join(orgminwage_subset, by = c("year", "state", "occ2010")) %>%
  distinct()

 
# Flag state/occupation cells whose mean hourly wage falls within 150% of
# the state's upper minimum wage.
long_covid <- long_covid %>%
  mutate(within_150pct = ifelse(mean_hourwage <= 1.5 * minwageU, 1, 0))

# Count flagged cells per occupation and year...
binary_covid <- long_covid %>%
  filter(within_150pct == 1) %>%
  group_by(occ2010, year) %>%
  summarise(count = n(), .groups = "drop")

# ...and total them per occupation across the period.
binary_total_covid <- binary_covid %>%
  group_by(occ2010) %>%
  summarise(total_count = sum(count), .groups = "drop")

# Attach year-over-year percentage changes for the occupations of interest.
covid_per_loss_y_over_y_subset <- covid_per_loss_y_over_y %>%
  select(year, occ2010, percentage_change)

long_covid <- long_covid %>%
  left_join(covid_per_loss_y_over_y_subset, by = c("year", "occ2010"))

# Drop rows that never matched a statefip in the org_minwage join.
long_covid <- long_covid %>%
  filter(!is.na(statefip))
    

# Distinct state FIPS codes present after the joins. NOTE(review): this
# object is not referenced again in the visible code -- kept, presumably,
# for debugging/reference.
statefip_covid <- long_covid %>%
  select(statefip) %>%
  distinct() %>%
  mutate(statefip = as.character(statefip))


# Attach state names from state_mapping (defined earlier in the file).
# NOTE(review): the mutate() creates a NEW column named statefip_covid
# instead of converting state_mapping$statefip itself, so the "statefip"
# join key keeps its original type. This was likely intended to be
# mutate(statefip = as.character(statefip)) -- confirm the key types on both
# sides actually match, otherwise the join silently produces no matches.
long_covid <- long_covid %>%
  left_join(state_mapping %>% mutate(statefip_covid = as.character(statefip)), by = "statefip")

Throughout this chunk of code, a large amount of data manipulation takes place. To give a shorthand explanation, the goal is to create state-level variables of job loss year over year during the period of interest, 2019-2022. By creating percentages I can also view which occupations were proportionately the largest by each state and year. This sets up the data for a table demonstrating which states were proportionately hit the hardest in certain occ2010 values during the recessionary periods.

This is labeled as “Table 6,” in the Google Doc Write up.


####Table of States that Saw the Largest Decreases in occ2010 Variables of Interest

# Combine state-level percent employment with each occupation's national
# within-150% share; employment_within_150 approximates the share of a
# state's employment in that occupation that sits near the minimum wage.
states_strongest_covid <- long_covid %>%
  select(occ2010, year, percent_employment, state) %>%
  left_join(all_occ_2019_b, by = "occ2010") %>%
  mutate(employment_within_150 = percent_employment * mean_within_150pct) %>%
  select(-percentage_change)

# Exported for the Python step that computes employment differences.
write_xlsx(states_strongest_covid, "states_strongest_covid.xlsx")

# Read the Python-computed employment differences back in.
states_per_change_2019 <- read_excel("employment_diff_results_covid.xlsx")

# Ten state/occupation/year cells with the most negative employment_diff.
lowest_diff_2019 <- states_per_change_2019 %>%
  arrange(employment_diff) %>%
  slice_head(n = 10)

# Render as a styled HTML table.
lowest_diff_2019 %>%
  kable("html", caption = "Top 10 Lowest Employment Diff Values") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Top 10 Lowest Employment Diff Values
occ2010 year percent_employment state mean_within_150pct employment_within_150 employment_diff
4720 2020 10.580645 California 0.6937201 7.3400064 -2.7591632
4030 2020 4.922725 California 0.6842271 3.3682618 -1.6726315
4720 2020 6.482364 Florida 0.6937201 4.4969465 -1.1009686
4720 2022 5.233645 Alabama 0.6937201 3.6306847 -0.9981237
4720 2020 1.643836 Maryland 0.6937201 1.1403618 -0.9915377
4720 2022 2.475248 Maryland 0.6937201 1.7171290 -0.9856767
4720 2020 1.813685 New Mexico 0.6937201 1.2581898 -0.8677267
4110 2020 1.262433 Tennessee 0.7857523 0.9919596 -0.8440442
4720 2020 8.879392 Texas 0.6937201 6.1598130 -0.8391437
4720 2021 3.234237 New York 0.6937201 2.2436554 -0.8390301

Here I create a table with the mutated data from the block above this one. The percent changes are calculated in Python for ease of calculations and then are read right back into R. From the table produced, we can see that California and Florida hold the top 3 spots for seeing the strongest decrease in 2 occupation groups, 4720 and 4030 during the COVID-19 recession. These two states also happen to be 2 of the most populated states. A common trend throughout the table is that the occupation 4720 (cashiers) saw the largest decrease in 8/10 observations. The employment_within_150 is calculated by the percent_employment multiplied by the mean_within_150pct to give a value of the percent of employment for that occ2010 observation that is within the minimum wage range.


####COVID-19 State Plots of Percentage Employment

# Save one bar chart of percent employment per state.
#
# For every distinct statefip in `long_covid`, plots percent_employment by
# year (bars dodged by occupation) and writes the figure to
# <output_dir>/state_<fips>.png under the current working directory.
#
# Args:
#   long_covid: data frame with columns statefip, state_name, year,
#     percent_employment, occ2010.
#   output_dir: folder name (created if absent) that receives the PNGs.
create_state_plots <- function(long_covid, output_dir = "state_plots_covid") {
  # Create the output directory if it doesn't exist.
  full_output_dir <- file.path(getwd(), output_dir)
  if (!dir.exists(full_output_dir)) {
    dir.create(full_output_dir)
  }

  # Normalize statefip to character so filtering/file names are consistent.
  long_covid <- long_covid %>% mutate(statefip = as.character(statefip))
  statefips <- unique(long_covid$statefip)

  # One plot per statefip code.
  for (statefip_code in statefips) {
    state_data <- long_covid %>% filter(statefip == statefip_code)

    # BUG FIX: unique() can return more than one name if the fips->name
    # mapping is inconsistent, which would make the title a vector; take the
    # first element so the title is always a single string.
    state_name <- unique(state_data$state_name)[1]

    if (nrow(state_data) > 0) {
      p <- ggplot(state_data, aes(x = year, y = percent_employment, fill = as.factor(occ2010))) +
        geom_bar(stat = "identity", position = "dodge") +
        labs(
          title = paste("Percent Employment in State", state_name),
          x = "Year",
          y = "Percent Employment",
          fill = "Occupation"
        ) +
        theme_minimal() +
        theme(
          axis.text.x = element_text(angle = 45, hjust = 1),
          strip.text = element_text(size = 8)
        )

      ggsave(filename = file.path(full_output_dir, paste0("state_", statefip_code, ".png")),
             plot = p, width = 10, height = 6)
    } else {
      # Defensive: statefips comes from the data, so this should not trigger.
      warning(paste("No data found for statefip:", statefip_code))
    }
  }
}


# Generate and save the per-state percent-employment charts (writes one PNG
# per state into ./state_plots_covid).
create_state_plots(long_covid, output_dir = "state_plots_covid")

This code creates individual graphs for each state in the dataset, keyed by statefip code. The graphs depict the percent employment of the occupations of interest chosen earlier for the 2019-2022 recession period. These graphs provide a visual representation for anyone curious to see which states had the highest percent employment for these occupations (spoiler: one of them is California). The ratios come from earlier code that takes the total count of occ2010 values by state and year and divides the occ2010 variables of interest by the total count to give a percentage. The function iterates over each statefip, ending at statefip = 56 (Wyoming), and outputs the graphs into a new folder in the working directory. (To view these graphs the code will have to be run on your own.)

The California graph was used in the Google Doc write up and was labeled “Figure 19.”